
# NOTE: path.results is referenced from global environment! path.results is only used for generating legend. I consider
#       this to be temporary so didn't fix the global env issue. Ultimately, I plan to generate and pass back legend graphics
#       to main and leave it to main to right to the path.results.

# this is a wrapper to mimic "load.and.clean.data" function, which takes the data and metadata input
# and put them in the proper form (if it can or else generates error). The main three tasks are
# 1- construct.data: loads count (raw or processed) data and construct the data object (see construct.data for details)
# 2- construct.sample.annotation: loads sample annotations and construct the smp.annot object (see. construct.sample annotation for details)
# 3- construct.probe.annotation: loads probe annotations and construct the prb.annot object (see construct.probe.annotation for details)

construct.data.and.annotations <- function(path.data,
                                           path.sampleannot,
                                           path.probeannot,
                                           prb.annotation.id.column,             # <------------ This may become static as opposed to a func input "UniqueID"
                                           prb.annotation.KEGG.id.column = NULL, # <------------ This may become static as opposed to a func input "KEGG.Pathways"
                                           prb.annotation.KEGG.Gene.Name = NULL, # <------------ This may become static as opposed to a func input "Gene.Name"
                                           prb.annotation.celltype.id.column = NULL, # <------------ This may become static as opposed to a func input "Cell.Type"
                                           prb.set.column = NULL,
                                           min.prb.set.size,
                                           min.KEGG.set.size,
                                           sample.annotation.id.column,   
                                           sampleannot.variables,
                                           sampleannot.variabletypes,
                                           sampleannot.referencelevels,
                                           exclude.probes = NULL,    #<-------------- This could be ERCC by default
                                           exclude.samples = NULL,   #<-------------- This could be NULL by default
                                           draw.color.legend,
                                           min.samples,
                                           plottypearg,
                                           log){
  
  # Initiate
  #---------
  warnings.paragraph = ""
  
  
  # Load and construct data
  #------------------------
  cat("loading data",file=log,sep='\n\n',append=TRUE)
  
  # Load up all existing data types (i.e. raw, normalized, background subtracted)
  # Assumption: raw file always exists. It errors out if rawData.csv doesn't exist.
  # Also if other files don't match raw file dimnames they are disregarded and NULL
  # is returend for the dimnmanes-mismatched files.
  #------------------------------------------------------------------------------
  raw <- processed <- backgroundSubtracted <- smp.names <- prb.names <- NULL
  
  #RAW
  if(file.exists(path.data[2])){
    colnum <- ncol(read.csv(path.data[2]))
    raw <- read.csv(path.data[2],row.names=1,colClasses = c("character",rep("numeric",colnum-1)),check.names = F)
    raw <- construct.data(dat = raw,min.samples = min.samples,dat.type = "raw", exclude.samples, exclude.probes)
    if(!is.null(raw)){
      smp.names <- rownames(raw)
      prb.names <- colnames(raw)
    }
  }else{
    stop("rawData is not provided")
  }
  
  #Background Subtracted: This is the data with background thresholded to zero
  if(file.exists(path.data[3])){
    colnum <- ncol(read.csv(path.data[3]))
    backgroundSubtracted.tmp <- read.csv(path.data[3],row.names=1,colClasses = c("character",rep("numeric",colnum-1)),check.names = F)
    if(all(smp.names %in% rownames(backgroundSubtracted.tmp)) & all(prb.names %in% colnames(backgroundSubtracted.tmp))){
      backgroundSubtracted.tmp <- raw - backgroundSubtracted.tmp[smp.names,prb.names]
      backgroundSubtracted.tmp[backgroundSubtracted.tmp<0] <- 0
      backgroundSubtracted <- construct.data(dat = backgroundSubtracted.tmp,min.samples = min.samples,dat.type = "background.subtracted", exclude.samples, exclude.probes)
    }else{
      wtmp <- "rows and/or columns of background subtracted data file don't match the raw data file. The analysis will use only raw data"
      warning(wtmp)
      paste(warnings.paragraph,wtmp)
    }

  }
  
  #Processed: This is the data which may be normalized and/or background subtracted
  if(file.exists(path.data[1])){
    colnum <- ncol(read.csv(path.data[1]))
    processed.tmp <- read.csv(path.data[1],row.names=1,colClasses = c("character",rep("numeric",colnum-1)),check.names = F)
    processed.tmp <- construct.data(dat = processed.tmp,min.samples = min.samples,normalized = "processed", exclude.samples, exclude.probes)
    if(all(smp.names %in% rownames(processed.tmp)) & all(prb.names %in% colnames(processed.tmp))){
      processed <- processed.tmp
    }else{
      wtmp <- "rows and/or columns of processed data file don't match the raw data file. The anlaysis will use only raw data"
      warning(wtmp)
      paste(warnings.paragraph,wtmp)
    }
      
  }
  
  
  
  # Apply make.name to sample annotation variables
  if(length(sampleannot.variables)>0)
    sampleannot.variables=make.names(sampleannot.variables)
  
  
  
  # Load and construct sample annotation
  #-------------------------------------
  cat("loading sample annotation files",file=log,sep='\n\n',append=TRUE)
  annot <- read.csv(path.sampleannot,row.names=sample.annotation.id.column,colClasses="character",check.names = F)
  
  
  #   annot <- merge(x = data.frame(mrg = rownames(raw)),y = data.frame(mrg = rownames(annot),annot),by = "mrg",all.x = T)
  #   rownames(annot) <- annot$mrg ; annot$mrg <- "NULL"
  tmp <- construct.sample.annotation(smp.annot = annot, 
                                     smp.names = smp.names,
                                     sampleannot.variables = sampleannot.variables,
                                     sampleannot.variabletypes = sampleannot.variabletypes,
                                     sampleannot.referencelevels = sampleannot.referencelevels)
  annot <- tmp$smp.annot
  sampleannot.variabletypes <- tmp$sampleannot.variabletypes
  sampleannot.referencelevels <- tmp$sampleannot.referencelevels
  warnings.paragraph <- paste(warnings.paragraph,tmp$warnings.paragraph)
  
  # Load and construct probe annotation
  #------------------------------------
  
  construct.default.prb.annot <- is.null(path.probeannot)
  
  if(!construct.default.prb.annot){
    cat("loading probe annotation files",file=log,sep='\n\n',append=TRUE)
    
    tmp <- tryCatch(expr = {construct.probe.annotation(prb.annot = read.csv(path.probeannot,check.names = F,stringsAsFactors = F),
                                                       prb.names = prb.names,
                                                       prb.annotation.id.column = prb.annotation.id.column,
                                                       prb.set.column = prb.set.column,
                                                       prb.annotation.KEGG.id.column = prb.annotation.KEGG.id.column,
                                                       prb.annotation.KEGG.Gene.Name = prb.annotation.KEGG.Gene.Name,
                                                       prb.annotation.celltype.id.column = prb.annotation.celltype.id.column,
                                                       min.prb.set.size= min.prb.set.size,
                                                       min.KEGG.set.size = min.KEGG.set.size)}
                    , error = function(er){
                      ertmp <- paste("Provided probe annotations did not pass requirement check:\n",er)
                      cat(ertmp)
                      class(ertmp) <- "probe.annotation.check.error"
                      return(ertmp)
                    }
                    , finally ={})
    
    if(class(tmp) == "probe.annotation.check.error")
      construct.default.prb.annot <- TRUE
  }
  
  # if no probe annotation path is provided or if the provided annotation failed to meet the requirement
  # default annotation is inferred
  if(construct.default.prb.annot){
    cat("constructing default probe annotation files",file=log,sep='\n\n',append=TRUE)
    tmp <- list(prb.annot = get.default.pannot())
  }
  

  pannot <- tmp$prb.annot
  prb.set.matrix <- tmp$prb.set.matrix
  KEGG.set.matrix <- tmp$KEGG.set.matrix
  celltype.set.matrix <- tmp$celltype.set.matrix
  warnings.paragraph <- paste(warnings.paragraph,tmp$warnings.paragraph)
  
  suppressWarnings(write.table(pannot[1:5,1:5],file=log,sep='\t',append=TRUE))
  
  
  cat("loaded probe annotation",file=log,sep='\n\n',append=TRUE)
  cat("dimensions of pannot:",file=log,sep='\n',append=TRUE)
  cat(dim(pannot),file=log,sep='\n',append=TRUE)
  cat("dimensions of annot:",file=log,sep='\n',append=TRUE)
  cat(dim(annot),file=log,sep='\n',append=TRUE)
  cat("dimensions of raw:",file=log,sep='\n',append=TRUE)
  cat(dim(raw),file=log,sep='\n',append=TRUE)
  
  
  print("Loaded user files")  
  cat("LOG:Loaded user files",file=log,sep='\n\n',append=TRUE)
  
  print("Finished setup; starting annotations and data checking")
  cat("LOG:Finished setup; starting annotations and data checking",file=log,sep='\n\n',append=TRUE)
  
  
  # generate aesthetics for sample annotation
  #------------------------------------------
  smp.annot.aes <- annot.to.aes(annot = annot)
  
  # render the aesthetic legends
  #-----------------------------
  if (draw.color.legend)
    render.annot.aes.legends(annotcols = smp.annot.aes$annotcols,annotcols2 = smp.annot.aes$annotcols2,plottypearg = plottypearg,path.results = path.results)
  

  
  out <- list(rawdata=raw,
              processed = processed,
              backgroundSubtracted = backgroundSubtracted, 
              annot=annot,
              pannot=pannot,
              prb.set.matrix = prb.set.matrix,
              KEGG.set.matrix = KEGG.set.matrix,
              celltype.set.matrix = celltype.set.matrix,
              annotcols=smp.annot.aes$annotcols,
              annotcols2=smp.annot.aes$annotcols2,
              sampleannot.variables=colnames(annot),
              sampleannot.variabletypes=sampleannot.variabletypes,
              sampleannot.referencelevels=sampleannot.referencelevels,
              #prune=prune,
              warnings.paragraph=warnings.paragraph)
  
  return(out)
  
}


# construct a data matrix that meets the basic requirements of the analysis (i.e. is numeric, has more than 1 sample, and doesn't have duplicate observations)
# and optionally allows for additional filtering via definition of "filter" function
# @dat: nxp numeric data.frame or matrix
# @min.sample: smallest number of rows allowed
# @dat.type: "raw","background.subtracted","processed"
# @exclude.samples: a character vector of rownames to be excluded
# @excluce.probes: a character vector of colnames to be excluded
# @filter: optional function to which dat will be passed for additional modifications
# value:
# dat: nxp matrix that passes the basic requirements of the analysis. The attribute "preprocessing.status" is also set to raw or normalized based on flag
construct.data <- function(dat, 
                           min.samples = 1,
                           dat.type,
                           exclude.samples=NULL,
                           exclude.probes=NULL,
                           filter = function(x,...){return(x)},...){
  if(!(is.data.frame(dat) | is.matrix(dat)))
    stop("dat needs to be data.frame or matrix")
  
  if(any(exclude.probes %in% colnames(dat)))
    dat <- dat[,!exclude.probes,drop=F]
  if(any(exclude.samples %in% colnames(dat)))
    dat <- dat[!exclude.samples,,drop=F]
  
  #Drop columns with all NA <-------------this is intended to correct read.csv sometimes reading an extra empty column 
  dat <- dat[,!sapply(as.data.frame(dat),function(x)all(is.na(x))),drop=F]
  
  if(any(!sapply(dat,FUN = class) %in% c("numeric","integer")))
    stop("all columns of dat needs to numeric")
  
  # filter dat per specified filter
  dat <- filter(dat,...)
  
  dat <- as.matrix(dat)
  
  # If all dat==0 return NULL
  #--------------------------
  if(all(dat==0))
    return(NULL)
  
  
  # Exit duplicate sample row
  #------------------------------
  samplecor = cor(t(dat),use="pairwise.complete.obs")
  diag(samplecor) <- 0
  if(max(abs(samplecor))==1)
  {
    cat("document.write('<p>There are at least two samples with identical expression profiles.  Remove duplicate RCCs and re-run the analysis.</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE) 
    stop("There are at least two samples with identical expression profiles.  Remove duplicate RCCs and re-run the analysis.")       
  }
  
  
  # Exit if less than min sample size:
  #-----------------------------------
  if(nrow(dat)<min.samples)
    stop(paste("Less than",min.samples," samples were included in the analysis.  Re-run with multiple samples."))
  
  
  
  # set the data preprocessing.status
  #----------------------------------
  attr(dat,"preprocessing.status") <- dat.type
  #   if(normalized)
  #     attr(dat,"preprocessing.status") <-  "normalized"
  
  return(dat)
}

# construct probe annotation that meets the basic requirement of the analysis(i.e. is matrix or data.frame)
# @prb.annot: a matrix or a data.frame containing probe annotation information
# @prb.names: a character vector containing the probe names for which we have data. It ensures that the output has a row for each of the prb.names in order
# @prb.annotation.id.column: a character defining the name of the column containing unique probe ids
# @prb.set.column: NULL or a character string, the name of the column in prb.annot containing probe set information
# @prb.annotation.KEGG.id.column: a character. it is the name of the column with KEGG ids
# @prb.annotation.KEGG.Gene.Name: a character. It is the name of the column with the gene names to be used in KEGG pathway calls
# @prb.annotation.celltype.id.column: a character. It is the name of the column with the celltype info
# @min.prb.set.size: min number of probes needed to present in probeset for the probeset to be valid
# @min.KEGG.set.size: min number of probes present in the KEGG pathway
# @NA.handle.fun: a function that receives the prb.annotation matrix/data.frame and handles how to replace NA
# value:
# prb.annot: constructed annotation data.frame to meet the analysis requirements
# prb.set.matrix: a 0/1 matrix of probe membership in probesets. if prb.set.column is NULL, this will be NULL too
# KEGG.set.matrix: a 0/1 matrix of probe membership in KEGG pathways
# celltype.set.matrix: a 0/1 matrix of probe membership in cell type marker sets

construct.probe.annotation <- function(prb.annot,
                                       prb.names,
                                       prb.annotation.id.column,
                                       prb.set.column = NULL,
                                       prb.annotation.KEGG.id.column = NULL,
                                       prb.annotation.KEGG.Gene.Name = NULL,
                                       prb.annotation.celltype.id.column = NULL,
                                       min.prb.set.size,
                                       min.KEGG.set.size,
                                       NA.handle.fun = function(x,...){return(x)},
                                       ...){
  
  
  
  warnings.paragraph <- NULL
  required.names <- c(prb.annotation.id.column, "Probe.Label","Analyte.Type", "Is.Control","Control.Type")
  required.complete <- c(prb.annotation.id.column, "Probe.Label","Analyte.Type")
  missing.req.cols <- incomplete.cols <- character(length = 0)
  
  if(!(is.data.frame(prb.annot) | is.matrix(prb.annot)))
    stop("prb.annot needs to be data.frame or matrix")
  
  # cast as data.frame
  if(is.matrix(prb.annot))
    prb.annot <- data.frame(prb.annot,stringsAsFactors = F,check.names = F)
  
  if(!prb.annotation.id.column %in% colnames(prb.annot))
    stop("No probe ID column is provided in the probe annotations")
  
  if(any(sapply(prb.annot[prb.annotation.id.column],function(x) isTRUE(x =="") | is.na(x))))
    stop("Probe ID in probe annotations has missing elements")
  
  if(any(!required.names %in% colnames(prb.annot)))
    missing.req.cols <- required.names[!required.names %in% colnames(prb.annot)]

  
  if(sum(is.na(prb.annot[,required.complete]))!=0 | sum(prb.annot[,required.complete]=="")!=0)
    incomplete.cols <- required.complete[apply(prb.annot[,required.complete],MARGIN = 2,FUN = function(x) (sum(is.na(x)) + sum(x == ""))!=0 )]

  
  
  #Drop empty or NA columns:
  #This corrects read.csv sometimes reading an extra empty column
  #It also more broadly allows testing content availablity by colname
  #because if the colname is not there ==> content is not there!
  prb.annot <- prb.annot[,!sapply(as.data.frame(prb.annot),function(x)all(is.na(x)) | all(x=="")),drop=F]
  
  
  
  # If any of the required info is missing (after 
  # removing empty cols) bring them in from default
  #-------------------------------------------------
  cols.to.infer <- unique(c(missing.req.cols,incomplete.cols))
  if(length(cols.to.infer)>0){
    default.prb.annot <- get.default.pannot()
    default.prb.annot <- default.prb.annot[match(prb.annot[,prb.annotation.id.column],default.prb.annot[,prb.annotation.id.column]),]
    for(col.to.infer in cols.to.infer){
      prb.annot[[col.to.infer]] <- switch(class(default.prb.annot[,col.to.infer]),
                                          numeric = NA,
                                          character = "")
      prb.annot[,col.to.infer] <- default.prb.annot[,col.to.infer]
    }
  }
  
  
  
  
  # remove any annotations with NA as unique id
  prb.annot <- prb.annot[!is.na(prb.annot[,prb.annotation.id.column]), ]
  
  if(sum(duplicated(prb.annot[,prb.annotation.id.column]))>0)
    stop("probe ids are not unique")
  
  # Assign rownames to be the probe unique ids
  rownames(prb.annot) <- prb.annot[,prb.annotation.id.column]
  
  if(sum(rownames(prb.annot) %in% prb.names)== 0)
    stop("No match between the probe ids in the annotation and probe ids in the dataset was found")
  

  # keep only prb.annot for probes in prb.names
  tmp.types <- sapply(prb.annot,FUN = class)
  tmp <- as.data.frame(matrix(nrow = length(prb.names),ncol=ncol(prb.annot),dimnames = list(prb.names,colnames(prb.annot))))
  tmp <- data.frame(sapply(colnames(tmp),FUN = function(x) {y <- tmp[,x]; class(y) <- tmp.types[x]; return(y)},simplify = F),stringsAsFactors = F,row.names = prb.names)
  tmp[rownames(prb.annot)[rownames(prb.annot) %in% prb.names], ] <- prb.annot[rownames(prb.annot)[rownames(prb.annot) %in% prb.names],] 
  prb.annot <- tmp
  
  
  # Make probe labels unique within analyte by appending index
  # This is for the case of multiple probes of the same analyte
  # type having the same name (different probe ID) as in RLF merge
  # could happen
  #-----------------------------------------------------------
  for(analyte in unique(prb.annot$Analyte.Type)){
    prb.annot[prb.annot$Analyte.Type == analyte,"Probe.Label"] <- make.unique(prb.annot[prb.annot$Analyte.Type == analyte,"Probe.Label"])
  }
  
  
  # Check probeset info
  if(!isTRUE(prb.set.column %in% colnames(prb.annot))){
    print("Warning: No probe set annotation is provided")
    warnings.paragraph <- paste(warnings.paragraph,"Warning: No probe set annotation is provided\n")
    warning("colnames(prb.annot) needs to include prb.set.column")
    prb.set.matrix <- NULL
  }
      
  
  # Check KEGG id and Gene.Name
  if(!isTRUE(prb.annotation.KEGG.id.column %in% colnames(prb.annot))){
    print("Warning: No KEGG id annotation is provided")
    warnings.paragraph <- paste(warnings.paragraph,"Warning: No KEGG id annotation is provided\n")
    warning("colnames(prb.annot) needs to include prb.annotation.KEGG.id.column")
    prb.annotation.KEGG.id.column <- NULL
  }
  
  
  # Check KEGG id and Gene.Name
  if(!isTRUE(prb.annotation.KEGG.Gene.Name %in% colnames(prb.annot))){
    print("Warning: No KEGG gene name annotation is provided")
    warnings.paragraph <- paste(warnings.paragraph,"Warning: No KEGG gene name annotation is provided\n")
    warning("colnames(prb.annot) needs to include prb.annotation.KEGG.Gene.Name")
    prb.annotation.KEGG.Gene.Name <- NULL
  }
  
  # Check cell type annotation
  if(!isTRUE(prb.annotation.celltype.id.column %in% colnames(prb.annot))){
    print("Warning: No celltype probe annotation is provided")
    warnings.paragraph <- paste(warnings.paragraph,"Warning: No celltype probe annotation is provided\n")
    warning("colnames(prb.annot) needs to include prb.annotation.celltype.id.column")
    prb.annotation.celltype.id.column <- NULL
  }
  
  # Harmonize is.Control into 0/1 across ("","true"),(FALSE,TRUE)
  #--------------------------------------------------------------
  if("Is.Control" %in% colnames(prb.annot)){
    if(length(unique(prb.annot$Is.Control))!=2)
      stop("Is.Control column in probe annotation needs to have 2 unique levels/values")
    
    if(is.character(prb.annot$Is.Control))
      prb.annot$Is.Control <- as.numeric(prb.annot$Is.Control %in% c("true","TRUE"))
    prb.annot$Is.Control <- as.numeric(prb.annot$Is.Control)
    
  }
    
  
  # handle NA per specified NA.handle.func
  prb.annot <- NA.handle.fun(prb.annot,...)
  
  # # cast as data.frame
  # prb.annot <- as.data.frame(prb.annot)
  
  
  # construct the probeset matrix
  #------------------------------
  prb.set.matrix <- NULL
  if(!is.null(prb.set.column)){
    
    # #Eliminate illegal characters in prb.set.column
    # #Right now only + is eliminated. There may be more
    # #-------------------------------------------------
    # prb.annot[,prb.set.column] <- gsub(pattern = "\\+","\\.",prb.annot[,prb.set.column])
    
    #Swap illegal characters in prb.set.column with their legal char combo
    #---------------------------------------------------------------------
    prb.annot[,prb.set.column] <- swap.illegal.char(prb.annot[,prb.set.column])
    
    tmp <- tryCatch(expr = get.set.matrix(prb.annot = prb.annot,set.column.id = prb.set.column,min.set.size = min.prb.set.size,ensure.cases.insensitivity = TRUE),
                    error = function(er){
                      ertmp <- paste("Geneset information is discarded: the following error occurred\n",er)
                      cat(ertmp)
                      cat("document.write('<p> Geneset information is discarded: see summary for more info</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE)
                      class(ertmp) <- "error"
                      return(ertmp)
                    },
                    finally = {})
    if(class(tmp) == "error"){
      warnings.paragraph <- paste(warnings.paragraph,tmp)
    }else{
      warnings.paragraph <- paste(warnings.paragraph,tmp$warnings.paragraph)
      prb.set.matrix <- tmp$set.matrix
    }
  }
  
  KEGG.set.matrix <- NULL
  if(!is.null(prb.annotation.KEGG.id.column) & !is.null(prb.annotation.KEGG.Gene.Name)){
    
    
    tmp <- tryCatch(expr = get.set.matrix(prb.annot = prb.annot,set.column.id = prb.annotation.KEGG.id.column,min.set.size = min.KEGG.set.size,ensure.cases.insensitivity = FALSE),
                    error = function(er){
                      ertmp <- paste("KEGG information is discarded: the following error occurred\n",er)
                      cat(ertmp)
                      cat("document.write('<p> KEGG information is discarded: see summary for more info</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE)
                      class(ertmp) <- "error"
                      return(ertmp)
                    },
                    finally = {})
    
    if(class(tmp) == "error"){
      warnings.paragraph <- paste(warnings.paragraph,tmp)
    }else{
      warnings.paragraph <- paste(warnings.paragraph,tmp$warnings.paragraph)
      KEGG.set.matrix <- tmp$set.matrix
    } 
  }
  
  
  celltype.set.matrix <- NULL
  if(!is.null(prb.annotation.celltype.id.column)){
    tmp <- tryCatch(expr = get.set.matrix(prb.annot = prb.annot,set.column.id = prb.annotation.celltype.id.column,min.set.size = 0,ensure.cases.insensitivity = TRUE),
                    error = function(er){
                      ertmp <- paste("Cell Type information is discarded: the following error occurred\n",er)
                      cat(ertmp)
                      cat("document.write('<p> Cell Type information is discarded: see summary for more info</p>');", file=paste(path.inc,"//status.js",sep=""),append=TRUE)
                      class(ertmp) <- "error"
                      return(ertmp)
                    },
                    finally = {})
    
    if(class(tmp) == "error"){
      warnings.paragraph <- paste(warnings.paragraph,tmp)
    }else{
      warnings.paragraph <- paste(warnings.paragraph,tmp$warnings.paragraph)
      celltype.set.matrix <- tmp$set.matrix
    } 
  }
  
  
  return(list(prb.annot = prb.annot,
              prb.set.matrix = prb.set.matrix,
              KEGG.set.matrix = KEGG.set.matrix,
              celltype.set.matrix = celltype.set.matrix,
              warnings.paragraph=warnings.paragraph))
  
}



# Given the input sample annotation data.frame and desired list of variables and their specifications, it construct a valid sample annotation data.frame
# that meets the analysis requirement. Specifically it handles type casting the variables and removes variables that are constant.
# NOTE: no check on legality of names are performed. Only legal names are expected here.

# @smp.annot: a data.frame containing annotations for samples on variables of interest
# @smp.names: a character vector containing the sample names for which we had data (in constructed data object). It ensures the ouput has a row for each smp.names in order
# @sampleannot.variables: character vector of length m specifying the variables of interest. The elements need to be contained within the colnames of smp.annot
# @sampleannot.variabletypes: a character vector of length m describing the type of each sampleannot.variables element to be cast in. 
#                             It needs to consist of element "categorical","continuous" and "boolean"
# @sampleannot.referencelevels: a character vector of length m, containig the reference level for the vector sampleannot.variables. If sampleannot.variables is 
#                               "continuous", NA is the convention (although not necessary as any value will be ignored)
# values: 
# smp.annot: a data.frame smp.annot, constructed to meet the analysis requirement
# sampleannot.variabletypes: named character vector of variable types
# sampleannot.referencelevels: named character vector of reference levels for categorical variables or NA
# warnings.paragraph: this is a character string to be appended to warning paragraph

construct.sample.annotation <- function(smp.annot,
                                        smp.names,
                                        sampleannot.variables,
                                        sampleannot.variabletypes,
                                        sampleannot.referencelevels
                                        ){
  if(!is.data.frame(smp.annot))
    stop("smp.annot needs to be a data.frame")
  
  if(any(!sampleannot.variables %in% colnames(smp.annot)))
    stop("sampleannot.variables needs to be in colnames(smp.annot)")
  
  if(!is.character(smp.names))
    stop("smp.names need to be character vector")
  
  # Get only smp.annots for which we have samples and restrict the colnames to sampleannot.variables
  smp.annot <- smp.annot[rownames(smp.annot) %in% smp.names,sampleannot.variables,drop=F]
  tmp <- as.data.frame(matrix(nrow = length(smp.names),ncol =ncol(smp.annot),dimnames = list(smp.names,colnames(smp.annot))))
  tmp[rownames(smp.annot),] <-smp.annot
  smp.annot <- tmp
  
  # construct the smp.annot metadata
  #---------------------------------
  annot.vars <- data.frame(type = sampleannot.variabletypes, 
                           sampleannot.referencelevels,
                           row.names = sampleannot.variables,
                           stringsAsFactors = F)
  
  # type cast
  #----------
  vars.to.remove <- warnings.paragraph <- NULL
  for(v in sampleannot.variables){
    tmp <- tryCatch(expr = type.cast(x = smp.annot[,v],ref = annot.vars[v,"sampleannot.referencelevels"], type = annot.vars[v,"type"])
                    , error = function(er){
                      ertmp <- paste("Error in assigning variable",v,"as",annot.vars[v,"type"],"\n",er)
                      class(ertmp) <- "type.cast.error"
                      return(ertmp)
                    }
                    , warning = function(w){
                      wtmp <- paste("Warning in assigning variable type for",v,"as",annot.vars[v,"type"],"\n", w)
                      class(wtmp) <- "type.cast.warning"
                      return(wtmp)
                    }
                    ,finally ={})
    
    if(class(tmp) == "type.cast.error"){
      vars.to.remove <- c(vars.to.remove,v)
      warnings.paragraph <- paste(warnings.paragraph,tmp)
      next
    }
    if(class(tmp) == "type.cast.warning"){
      warnings.paragraph <- paste(warnings.paragraph,tmp)
      smp.annot[,v] <- suppressWarnings(type.cast(x = smp.annot[,v],ref = annot.vars[v,"sampleannot.referencelevels"], type = annot.vars[v,"type"]))
    }
    if(any(!class(tmp) %in% c("type.cast.error","type.cast.warning")))
      smp.annot[,v] <- tmp
  }
  
  
  # Flag single level varaibles for removal
  #----------------------------------------
  vars.nlevels <- sapply(smp.annot,function(x) nlevels(factor(x)) )[sampleannot.variables]
  if(sum(vars.nlevels==1)>0)
    warnings.paragraph <- paste(warnings.paragraph,"constant variable(s) are removed:",paste(names(vars.nlevels[vars.nlevels==1]),"\n",collapse = ",") )
  
  #   # Flag categorical variables with as many levels as nrow for removal
  #   #-------------------------------------------------------------------
  #   if(any(vars.nlevels[sampleannot.variabletypes[sampleannot.variabletypes == "categorical"]] == nrow(smp.annot)))
  #     warnings.paragraph <- paste(warnings.paragraph,"categorical variable(s) with as many levels as overall sample size are removed:\n",paste(names(vars.nlevels[vars.nlevels==1]),collapse = ",") )
  #   
  
  vars.to.remove <- union(vars.to.remove,names(vars.nlevels[vars.nlevels==1]))
  
  # Get final smp.annot
  #--------------------
  smp.annot <- smp.annot[,sampleannot.variables[!sampleannot.variables %in% vars.to.remove],drop=F]
  
  if(ncol(smp.annot)==0)
    stop("No viable sample annoation was provided, each sample annoation needs to have at least two distinict levels")
  
  # Reset sample annot variable type
  # NOTE: right now boolean is cast as categorical so it is not used
  #-----------------------------------------------------------------
  sampleannot.variabletypes <- c(factor = "categorical",numeric = "continuous",integer="continuous",logical = "boolean")[sapply(smp.annot,class)]
  names(sampleannot.variabletypes) <- colnames(smp.annot)
  
  # Reset sample reference
  #-----------------------
  sampleannot.referencelevels <- unlist(sapply(smp.annot,function(x) ifelse(is.factor(x),levels(x)[1],NA)))
  
  return(list(smp.annot = smp.annot,
              sampleannot.variabletypes = sampleannot.variabletypes,
              sampleannot.referencelevels = sampleannot.referencelevels,
              warnings.paragraph = warnings.paragraph))
}




# This function uses the "GeneAnnotations_additional.csv" file to build probe annotation file containing the following
# columns: "UniqueID","Probe.Label","Analyte.Type","Is.Control","Control.Type",  and if available "Control.Conc".
# Additionally it generates a default probeset including all probes
# @fl: the path to where nSolver outputs the default "GeneAnnotations_additional.csv"

get.default.pannot <- function(fl = "./GeneAnnotations_additional.csv"){
  require(stringr)
  
  control.names <- c("Positive","Negative","Housekeeping","Protein_NEG","Protein_CELLNORM","Invariant","RestrictionSite","SpikeIn","Ligation","Purification","Binding")
  
  pannot<- read.csv(fl,check.names = F,stringsAsFactors = F)
  UniqueID <- pannot$Probe.Name
  Probe.Label <- unlist(lapply(pannot$Gene.Name.Class.Name,function(x) paste(rev(rev(unlist(strsplit(x = x,split = "\\."),recursive = F))[-1]),collapse=".")))
  class.name <- unlist(lapply(pannot$Gene.Name.Class.Name,function(x) rev(unlist(strsplit(x,split = "\\.")))[1]))
  Is.Control <- as.numeric(class.name %in% control.names)
  Control.Type <- rep("",nrow(pannot));Control.Type[Is.Control==1] <- class.name[Is.Control==1]
  
  
  
  
  #protein
  prt.indices <- grep("^nProt_",UniqueID) # starts with nProt_
  prtCTRL.indices <- grep("^nPCtl_",UniqueID) # starts with nPCtl_
  
  #miRNA
  miRNA.indices <- grep("^nmiR[0-9]{5}\\.",UniqueID) #starts with nmiR followed by 5 numbers and followed by "."
  
  
  #Make everything not protein or miRNA into mRNA
  Analyte.Type <- rep("mRNA",nrow(pannot))
  Analyte.Type[unique(c(prt.indices,prtCTRL.indices))] <- "protein"
  Analyte.Type[miRNA.indices] <- "miRNA"
  
  #Distinguish between CNV and mRNA based on Assay type as both are type Endogenous
  Analyte.Type[pannot$Assay.Type == "CNV"] <- "DNA"
  
  
  prb.annot <- data.frame(ProbeID= UniqueID,
                          Probe.Label= Probe.Label,
                          Analyte.Type= Analyte.Type,
                          Is.Control=Is.Control,
                          Control.Type = Control.Type,
                          row.names = UniqueID,
                          check.names = F,
                          stringsAsFactors = F)
  
  # Make probe labels unique within analyte by appending index
  # This is for the case of multiple probes of the same analyte
  # type having the same name (different probe ID) as in RLF merge
  # could happen
  #-----------------------------------------------------------
  for(analyte in unique(prb.annot$Analyte.Type)){
    prb.annot[prb.annot$Analyte.Type == analyte,"Probe.Label"] <- make.unique(prb.annot[prb.annot$Analyte.Type == analyte,"Probe.Label"])
  }
  
  
  
  # Add the concentration column if info exists
  mRNA.control.indices <- grep("(^POS_[A-Z]{1}\\([0-9]+[\\.]*[0-9]*\\))|(^NEG_[A-Z]{1}\\([0-9]+[\\.]*[0-9]*\\))" ,prb.annot$Probe.Label)
  if(length(mRNA.control.indices) > 0){
    
    conc <- str_extract(string = prb.annot$Probe.Label[mRNA.control.indices],pattern = "\\([^()]+\\)")
    conc <- substring(conc, 2, nchar(conc)-1)
    prb.annot$Control.Conc <- NA
    prb.annot$Control.Conc[mRNA.control.indices] <- as.numeric(conc)
  }
  
  return(prb.annot)
  
}



# This function receives a vector x and cast it as defined by the character vector of the same length. For categorical variables where the desired reference is
# missing, and the values are digits, it approximate the reference level
# The supported types are "categorical" or "continuous" 
# @x: a vector of length m to be type cast
# @ref: a character designating the reference level for the vector x. If x is "continuous", NA is the convention (although not necessary as any value will be ignored)
# @type: a character describing the type the variable x to be cast in. It takes on values "categorical","continuous" or "boolean"
# value:
# It returns the variable in the specified type and with the correct reference in case of categorical type.

type.cast <- function(x,ref,type){
  
  # This fucntion is expected to only be used here and possibly might be eliminate, so kept
  # the code self-contained within the type.cast.  Future work could either eliminate or embed
  # the functionality within type.cast
  approximate.reference.level <- function(x,ref){
    
    ref0 = ref
    # if it looks like a numeric ref level:
    if(!is.na(as.numeric(ref0)))
    {
      # look for the same number amongst the levels of the variable:
      levels0 = as.numeric(as.character(unique(x)))
      closestlevel = order(abs(as.numeric(ref0) - levels0))[1]
      replacelevel = levels0[closestlevel]
      warning(paste("Assigned reference level",ref0,"had no match in the data.  It is being reassigned to the most similar observed level,",replacelevel))
      return(as.character(replacelevel))
    }
    # and if it's not a numeric reference level:
    if(is.na(as.numeric(ref0)))
    {
      stop(paste("Assigned reference level",ref0,"had no match in the data, and we were unable to guess which level should be used in its place"))
    }
    
  }
  
  if(type == "categorical"){
    if(ref %in% x)
      return(relevel(factor(x),ref = ref))
    #     warning(paste("The designated ref level",ref,"does not exist, a different level is assigned as reference"))
    #     return(factor(x))
    
    return(relevel(factor(x),ref = approximate.reference.level(x = x,ref = ref)))
  }
  
  if(type == "continuous")
    return(as.numeric(x))
  
  # Note we re caset boolean as categorical
  if(type == "boolean")
    return(as.factor(x))
}



# This function is intended for extracting the ";" seprated set names columns of probe annotation and making a matrix of
# probe x set 0/1 membership matrix
# @prb.annot: probe annotation data.frame
# @set.column.id: a character string. Name of the column from which the ";" content is to be extracted
# @min.set.size: minimum size of the set of probes within a set. At least one set needes to be larger than min.set.size and sets smaller than min.set.size are dropped.
# 
# @value
# set.matrix a 0/1 matrix of probes by set ids
# warnings.paragraph: warning texts to be passed back for logging purposes

get.set.matrix <- function(prb.annot,set.column.id, min.set.size = 0 ,ensure.cases.insensitivity = F){
  
  warnings.paragraph <- NULL
  set.membership <- lapply(as.character(prb.annot[,set.column.id]),FUN = function(x) unlist(strsplit(x = gsub(";[[:blank:]]*",";",x),split = ";")))
  set.names <- levels(factor(unlist(set.membership)))
  
  # check if any of the probsets differ only in letter case
  if(sum(duplicated(tolower(set.names)))>0 & ensure.cases.insensitivity){
    warnings.paragraph <- paste(warnings.paragraph,"Warning: Some names in", set.column.id,"differ only in lower/upper case letters used, they will be combined\n")
    require(reshape2)
    tmp <- melt(table(id = tolower(set.names),set.names))
    tmp <- tmp[tmp[,"value"]==1,c("id","set.names")]
    tmp2 <- tmp[!duplicated(tmp$id),]; tmp3 <- tmp2$set.names; names(tmp3) <- tmp2$id
    tmp4 <- data.frame(tmp,value = tmp3[tmp$id])
    map <- as.character(tmp4$value); names(map) <- tmp4$set.names
    set.membership <- lapply(as.character(prb.annot[,set.column.id]),FUN = function(x) unname(map[unlist(strsplit(x = gsub(";[[:blank:]]*",";",x),split = ";"))]))
    prb.annot[,set.column.id] <- unlist(lapply(set.membership,function(x)paste(x,collapse = ";")))
  }
  names(set.membership) <- rownames(prb.annot)
  
  #Ensure at least there is 1 set with larger than "min.set.size" member
  if(sum(min.set.size <= table(unlist(set.membership)))==0)
    stop("At least 1 set needs to have size equal or larger than", min.set.size)
  
  set.names <- levels(factor(unlist(set.membership)))
  set.matrix <- t(as.matrix(data.frame(lapply(set.membership,function(x) as.numeric(table(factor(x,levels = set.names))) ),row.names = set.names,check.names = F)))
  
  # Drop any probeset with fewer than min.set.size members
  too.few.prbs <- colnames(set.matrix)[colSums(set.matrix) < min.set.size]
  if(length(too.few.prbs)>0)
  {
    set.matrix <- set.matrix[,!colnames(set.matrix) %in% too.few.prbs]
    print(paste("Warning: the following probe sets hold less than",min.set.size,"probes and will be discarded:"))
    print(too.few.prbs)
    warnings.paragraph <- paste(warnings.paragraph,"Warning: the following probe sets hold less than",min.set.size,"probes and will be discarded:",paste(too.few.prbs,collapse=", "),"\n")        
  }
  return(list(set.matrix = set.matrix, warnings.paragraph = warnings.paragraph))
}




# Given an annotation data matrix, it translates the annot to aesthetic data.frame of the same structure
annot.to.aes <- function(annot){
  codecols2 = c("chartreuse3","darkgoldenrod2","deepskyblue2","firebrick2","darkorchid1","tan","slateblue3","forestgreen","coral","gray48",
                "aquamarine","bisque2","blue3","blueviolet","brown2","burlywood3","darkgoldenrod4","cyan3","cornflowerblue",
                "chocolate2","chartreuse1","darkorange","darkorchid4","darkred","darkslateblue","darkturquoise","deeppink1",
                "deeppink4","gray7","gold1","dodgerblue1")
  codecols2 = c(codecols2,colors())
  contcols = list()
  contcols[[1]] = c("deepskyblue2","grey","firebrick2")
  contcols[[2]] = c("forestgreen","grey","darkorange")
  contcols[[3]] = c("chartreuse","grey","gold1")
  contcols[[4]] = c("darkorchid1","grey","darkgoldenrod2")
  contcols[[5]] = c("darkslateblue","grey","chocolate2")
  for(i in 6:100){contcols[[i]]=contcols[[i-5]]}
  
  if(dim(as.data.frame(annot))[2]>0)
  {
    # matrix for coloring each observation by each variable:
    annotcols = matrix("white",dim(as.data.frame(annot))[1],dim(as.data.frame(annot))[2])
    # list of the unique colors for each annotation
    annotcols2 = list()
    dimnames(annotcols)[[1]] = dimnames(annot)[[1]]
    dimnames(annotcols)[[2]] = dimnames(annot)[[2]]
    colsused = 0; contcolsused=0
    for(i in dim(as.data.frame(annot))[2]:1)
    {
      # ID type of covariate
      if(class(annot[,i]) == "numeric")
      {
        annotcols[,i] = colorRampPalette(contcols[[1+contcolsused]])( 101 )[round((annot[,i]-min(annot[,i],na.rm=T))/(max(annot[,i],na.rm=T)-min(annot[,i],na.rm=T))*100+1,0)]
        annotcols2[[i]] = contcols[[1+contcolsused]]
        names(annotcols2[[i]]) = c("Low","Average","High")  
        names(annotcols2)[i] = dimnames(annot)[[2]][i]
        contcolsused = contcolsused+1
      }
      if(class(annot[,i]) == "factor")
      {
        annotcols[,i] = codecols2[(colsused+1):length(codecols2)][annot[,i]]
        annotcols2[[i]] = codecols2[(colsused+1):length(codecols2)][1:(length(unique(annot[,i]))-1*is.element(NA,annot[,i]))]  #<---- changed to not assign a color to missing elements
        names(annotcols2)[i] = dimnames(annot)[[2]][i]
        names(annotcols2[[i]]) = levels(annot[,i])  
        colsused = length(unique(annot[,i]))-1*is.element(NA,annot[,i])+colsused
      }
    }
  }
  return(list(annotcols = annotcols, annotcols2 = annotcols2))
}

# Given aes object of annotation it output legneds plots of the specified type in the specified path
render.annot.aes.legends <- function(annotcols, annotcols2,plottypearg, path.results){
  #   ### draw the color legend: for all variables at once:
  
  allcolors = alllegends = c()
  for(i in 1:length(annotcols2))
  {
    allcolors = c(allcolors,col=c(NA,annotcols2[[i]]),NA)
    alllegends = c(alllegends,c(main=names(annotcols2)[i],names(annotcols2[[i]]),NA))
  }
  for(r in 1:length(plottypearg)){
    plottype=plottypearg[r];
    maxwidth = max(nchar(alllegends))
    tempfilename = drawplot(filename=paste(path.results,"//color legend",sep=""),
                            plottype,width=max(.4,.1+maxwidth*.02),
                            height=1+.3*(length(allcolors)>20)+.3*(length(allcolors)>30))
    tempfilename=gsub(path.results,"results",tempfilename)
    par(mar=c(0,0,0,0))
    frame()
    legend("center",lty=1,lwd=12,col=allcolors,legend=alllegends)
    dev.off()}   
  
  
  
  ### draw the color legend: one for each variable:
  
  for(i in 1:length(annotcols2))
  {
    for(r in 1:length(plottypearg)){
      plottype=plottypearg[r];
      maxwidth = max(nchar(alllegends))
      tempfilename = drawplot(filename=paste(path.results,"//color legend - ",names(annotcols2)[i],sep=""),
                              plottype,width=max(.4,.1+maxwidth*.02),
                              height=1+.3*(length(annotcols2[[i]])>20)+.3*(length(annotcols2[[i]])>30))
      tempfilename=gsub(path.results,"results",tempfilename)
      par(mar=c(0,0,0,0))
      frame()
      legend("center",lty=1,lwd=12,col=c(NA,annotcols2[[i]]),legend=c(main=names(annotcols2)[i],names(annotcols2[[i]])))
      dev.off()}   
  }
  
}



#' This function receives a character vector and removes 
#' Illegal characters that break the analysis (either in html or plot saving)
#' NOTE: Decided against adding the 255 char limit check as that was very unlikely for geneset name
#' @x a character vator
 
swap.illegal.char <- function(x){
  char.2b.changed <- c(STR = "\\*",
                       FR  = "<",
                       TO  = ">",
                       CL  = "\\:",
                       TLD = "\\~",
                       BNG = "\\!",
                       AT  = "\\@",
                       DLR  = "\\$",
                       PRC = "\\%",
                       CR  = "\\^",
                       AP  = "\\`",
                       EQ  = "\\=",
                       CBL = "\\{",
                       CBR = "\\}",
                       BRL = "\\[",
                       BRR = "\\]",
                       PP = "\\|",
                       QZ = "\\?",
                       PLS = "\\+",
                       SQT = "\\'",
                       DQT = "\\\"",
                       FS = "\\/",
                       BS = "\\\\")
  
  # Test case
  # x <- "asdf * < > : ~ ! @ $ % ^ ` = { } [ ] | ? + ' \" / \\ "
  # x
  for(i in 1:length(char.2b.changed)){
    pt <- unname(char.2b.changed[i])
    rpl <- paste(".",names(char.2b.changed)[i],".",sep="")
    x <- gsub(pt,rpl,x = x)
    # cat(pt,"\n")
    # cat(x,"\n")
  }
  return(x)
}


#===========================================
#     get mRNA background from raw
#===========================================
estimate.log2.mRNA.BG.by.lane <- function(raw.dat,
                                          neg.colnames){
  # validation of input
  #--------------------
  if(is.data.frame(raw.dat)){
    numeric.cols <- apply(raw.dat,2,function(x) is.numeric(x))
    if(!all(numeric.cols)){
      warning("non-numeric mRNA raw.dat columns were dropped")
      raw.dat <- raw.dat[,numeric.cols]
    }
    raw.dat <- as.matrix(raw.dat)
  }
  
  
  if(!is.matrix(raw.dat))
    stop("raw.dat mRNA needs to be matrix")
  
  if(!all(neg.colnames %in% colnames(raw.dat)))
    stop("neg.colnames for mRNA data need to be in colnames(raw.dat)")
  
  # Simple Log normal model
  
  bg <- apply(log2(raw.dat[,neg.colnames]+1),1,mean) + qnorm(.975)*apply(log2(raw.dat[,neg.colnames]+1),1,sd)
  names(bg) <- rownames(raw.dat)
  return(bg)
}


#===========================================
#     get protein background from raw
#===========================================
estimate.log2.prot.BG.by.lane <- function(raw.dat,
                                          neg.colnames){
  get.orth.proj <- function(y,x,mod){
    er <- (y - (cbind(1,x) %*% coef(mod)) )/as.numeric(sqrt(t( c(1,-coef(mod)[2])  ) %*% c(1,-coef(mod)[2])))
    ort.proj <- cbind(x,y) - er %*% t(c(-coef(mod)[2],1)/sqrt(c(1,-coef(mod)[2])%*% c(1,-coef(mod)[2])))
    return(ort.proj)
  }
  
  # validation of input
  #--------------------
  if(is.data.frame(raw.dat)){
    numeric.cols <- apply(raw.dat,2,function(x) is.numeric(x))
    if(!all(numeric.cols)){
      warning("non-numeric protein raw.dat columns were dropped")
      raw.dat <- raw.dat[,numeric.cols]
    }
    raw.dat <- as.matrix(raw.dat)
  }
  
  
  if(!is.matrix(raw.dat))
    stop("raw.dat protein needs to be matrix")
  
  if(!all(neg.colnames %in% colnames(raw.dat)))
    stop("neg.colnames for protein data need to be in colnames(raw.dat)")
  
  if(length(neg.colnames)!=2)
    stop("for protein data, two neg.colnames are needed to estimate varaibility in lane background counts")
  
  #log2 tranform
  prot <- log2(raw.dat + 1)
  
  
  #=============================================
  # Find background (via deming reg model)
  #=============================================
  
  # Here we use the deming package with stdpat(1,0,1,0), which is the classical deming, 
  # note if the ratio of SDs are to be changed via stdpat, pay attention to scaling of sigma accordingly
  # when making CI for the IgGs
  
  require(deming)
  dmod <- deming(prot[,neg.colnames[2]]~prot[,neg.colnames[1]],stdpat=c(1,0,1,0))
  sigma <- dmod$sigma
  
  alpha <- atan(coef(dmod)[2])
  var.y.on.line <- sigma^2 * sin(alpha)
  var.x.on.line <- sigma^2 * coef(dmod)[2]
  var.on.line <- var.x.on.line + var.y.on.line 
  
  background.stats <- data.frame(prot[,neg.colnames])
  background.stats[[make.names(paste(neg.colnames[2],"hat",sep="."))]] <- cbind(1,prot[,neg.colnames[1]]) %*% dmod$coefficients
  background.stats[["UL"]] <- background.stats[[make.names(paste(neg.colnames[2],"hat",sep="."))]] +
    qnorm(.975)* sqrt(var.on.line * sin(alpha))
  background.stats[["LL"]] <- background.stats[[make.names(paste(neg.colnames[2],"hat",sep="."))]] -
    qnorm(.975)* sqrt(var.on.line * sin(alpha))
  background.stats[["out"]] <- prot[,neg.colnames[2]]<background.stats[["LL"]]| prot[,neg.colnames[2]]>background.stats[["UL"]]
  
  
  # Note: currently the estimate of the variability in background, sigma, 
  #       is taken wihtout excluding the outliers. The plot below shows the
  #       difference in trend after excluding the outliers (for eval purposes)
  visualize <- F
  if(visualize){
    plot(prot[,neg.colnames[1]],prot[,neg.colnames[2]],col = 1+background.stats[["out"]],pch = c(1,19)[1+background.stats[["out"]]],
         main = "Deming regression of negatives", xlab =neg.colnames[1], ylab =neg.colnames[2])
    abline(coef = dmod$coefficients,lty=1,col=1)
    abline(coef = deming(prot[!background.stats[["out"]],neg.colnames[2]]~prot[!background.stats[["out"]],neg.colnames[1]],stdpat=c(1,0,1,0))$coefficients,lty=2,col=2)
    abline(coef = coef(lm(prot[,neg.colnames[2]]~prot[,neg.colnames[1]])),lty=3,col=3)
    abline(coef = c(mean(prot[,neg.colnames[2]]-prot[,neg.colnames[1]]),1),lty=4,col=4)
    legend("bottomright",legend = c("Deming","Deming w/o outliers","OLS","Best fit slope=1"),col = 1:4,lty = 1:4)
    
  }
  
  orth.proj <- get.orth.proj(y = prot[,neg.colnames[2]],x = prot[,neg.colnames[1]],mod = dmod)
  
  # Here we get a representation of the "mean" of the two negatives along with the CI around this est.
  background.stats[["orth.proj.bar"]] <- apply(orth.proj,1,mean) 
  background.stats[["opbar.UL"]] <- background.stats[["orth.proj.bar"]] + qnorm(.975)* sqrt( (var.on.line * sin(alpha)+ var.on.line * cos(alpha))/2 )
  background.stats[["opbar.LL"]] <- background.stats[["orth.proj.bar"]] + qnorm(.025)* sqrt( (var.on.line * sin(alpha)+ var.on.line * cos(alpha))/2 )
  
  # The upper limit of the orthogonal projection is chosen as the threshold for background
  background.stats[["background.threshold"]] <- background.stats[["opbar.UL"]]
  
  bg <- background.stats[rownames(prot),"background.threshold"]
  names(bg) <- rownames(raw.dat)
  return(bg)
}



# thresholding.frequency = 0.5
# thresholding.value = 20
# thresholding.perform = TRUE

filter.out.low.counts <- function(raw, prb.annots,thresholding.frequency, thresholding.value,thresholding.perform){
  warnings <- NULL
  pruneflag <- FALSE
  prune <- rep(FALSE,ncol(raw)); names(prune) <- colnames(raw)
  pruned.probes <-  character(0)
  if(attr(raw,"preprocessing.status") != "raw" | !thresholding.perform){
    if(!thresholding.perform){
      warnings<- "Warning: raw data was not provided for filtering out low count genes - retaining all genes\n"
      warning(warnings)
    }

    return(list(pruneflag = pruneflag, prune = prune,pruned.probes= pruned.probes,warnings=warnings))
  }
  
  
  # Get background if AUTO
  #-----------------------
  for(analyte in names(thresholding.frequency)){
    analyte.prbs <- prb.annots[prb.annots$Analyte.Type==analyte,"ProbeID"]
    
    
    if(thresholding.value[[analyte]] == "AUTO"){
      
      #Default value (i.e. no threshold)
      thresholding.value[[analyte]] <- apply(raw[,analyte.prbs],1,min)
      
      if(analyte == "mRNA"){
        neg.colnames <- rownames(prb.annots)[prb.annots$Control.Type == "Negative"]
        if(length(neg.colnames)>0)
          thresholding.value[[analyte]] <- 2^(estimate.log2.mRNA.BG.by.lane(raw.dat = raw,neg.colnames = neg.colnames))
      }
        
      if(analyte == "protein"){
        neg.colnames <- rownames(prb.annots)[prb.annots$Control.Type == "Protein_NEG"]
        if(length(neg.colnames)>0)
          thresholding.value[[analyte]] <- 2^(estimate.log2.prot.BG.by.lane(raw.dat = raw,neg.colnames = neg.colnames))
      }
        
    }else{
      thresholding.value[[analyte]] <- rep(thresholding.value[[analyte]],nrow(raw))
      names(thresholding.value[[analyte]]) <- rownames(raw)
    }
    
    if(thresholding.frequency[[analyte]] == "AUTO")
      thresholding.frequency[[analyte]] <- .5
    
    
    
    prune[analyte.prbs] <- apply(sweep(raw[,analyte.prbs],MARGIN = 1,STATS = thresholding.value[[analyte]],FUN = "<"),2,mean,na.rm=T)>thresholding.frequency[[analyte]]
    
  }
  
  #prune <- apply((raw<thresholding.value),2,mean,na.rm=T)>thresholding.frequency
  prune[is.na(prune)] <- TRUE
  
  if(mean(prune)>0.8)
  {
    pruneflag <- FALSE
    warning("Warning: More than 80% of genes were pruned using these cutoffs - retaining all genes")
    cat("LOG:Warning: More than 80% of genes were pruned using these cutoffs - retaining all genes",file=log,sep='\n',append=TRUE)
    warnings <- paste(warnings,"Warning: More than 80% of genes were pruned using the given cutoffs - retaining all genes","\n")
    prune[prune] <- FALSE
  }
  
  pruned.probes <- suppressWarnings(as.matrix(dimnames(raw)[[2]][prune]))
  colnames(pruned.probes) <- paste("The below genes were removed for falling below the background level too frequently")
  write.csv(as.matrix(pruned.probes),file="genes removed for low signal.csv",row.names=FALSE)
  return(list(pruneflag = pruneflag, prune = prune,pruned.probes= pruned.probes,warnings=warnings))
  
}

#===============
# test case
#===============

test.construct.data.and.annotations <- function(path = "S:/AMashadi-Hossein/Take2/2015_11_02/nsolverDev/JohnBucciCode/nSolver_advanced_local_NEW"){
  setwd(dir = path)
  #draw plot function
  drawplot = function(filename,type = c("png","pdf","tiff","jpg","bmp"),width = 1, height = 1,heatmapres=FALSE)
  {
    # modify filename to remove illegal characters:
    illegalcharacters = c("<",">",":","\"","\\?","\\|","\\*")  #
    # only work on the later part of the filename after the "C://":
    filename1 = substr(filename,1,4)
    filename2 = substr(filename,5,length(strsplit(filename,split="")[[1]]))
    for(i in 1:length(illegalcharacters))
    {
      filename2 = gsub(illegalcharacters[i],"_",filename2);  #print(illegalcharacters[i]); print(filename2)
    }
    filename = paste(filename1,filename2,sep="")
    # modify resolution for heatmaps:
    res = NA; pointsize = 12
    if(heatmapres){res = 200;pointsize = 7}
    
    # run plotting function:
    if(type=="png")
    {
      png(filename=paste(filename,type,sep="."),width=480*width,height=480*height,res=res,pointsize=pointsize)
    }
    if(type=="jpg")
    {
      jpeg(filename=paste(filename,type,sep="."),width=480*width,height=480*height)
    }
    if(type=="bmp")
    {
      bmp(filename=paste(filename,type,sep="."),width=480*width,height=480*height)
    }
    if(type=="pdf")
    {
      pdf(file=paste(filename,type,sep="."),width=7*width,height=7*height)
    }
    if(type=="tiff")
    {
      tiff(filename=paste(filename,type,sep="."),width=480*width,height=480*height)
    }
    return(filename) 
  }
  g <- globalenv()
  assign(x = "data.type",value = "raw",envir = g)
  assign(x = "log",value = "output/log_file.txt", envir = g)
  g$drawplot <- drawplot
  
  #   data.type <- "raw"
  #   log = "log_file.txt"
  
  path.results <- paste(path,"/output",sep="")
  assign("path.results",value = path.results,envir = g)
  dir.create(path = path.results,showWarnings = F)
  
  
  #####
  ##### edited by jbucci 12/1/2015
  #####
  
  path.data <- c("ProcessedData_12012015.csv",
                 "RawData_12012015.csv",
                 "BackgroundSubtractedValues_12012015.csv")
  path.sampleannot <- "SampleAnnotations_20151214.csv"
  path.probeannot <- "ProbeAnnotations_12012015.csv"  #"GeneAnnotations.csv"
  prb.annotation.id.column <- "probe.ID" #"Gene.Name"
  prb.set.column <- "prb.sets.a" #"Immune.Response.Category"
  min.prb.set.size <- 3
  sample.annotation.id.column <- "smp.ID"  
  sampleannot.variabletypes = c("categorical","continuous","categorical","categorical")
  sampleannot.variables = c("smp.names", "t","trt","cell.type")
  sampleannot.referencelevels = c("Jurkat 100K_50K-1",NA,"no","Jurkat") 
  
  exclude.probes = NULL    #<-------------- This could be ERCC by default
  exclude.samples = NULL   #<-------------- This could be NULL by default
  draw.color.legend = T
  min.samples = 1
  plottypearg = "pdf"
  
  
  
  out <- construct.data.and.annotations(path.data,
                                        path.sampleannot,
                                        path.probeannot,
                                        prb.annotation.id.column, # <------------ This needs to become static as opposed to a func input
                                        prb.set.column,
                                        min.prb.set.size,
                                        sample.annotation.id.column,   
                                        sampleannot.variables,
                                        sampleannot.variabletypes,
                                        sampleannot.referencelevels,
                                        exclude.probes = NULL,    #<-------------- This could be ERCC by default
                                        exclude.samples = NULL,   #<-------------- This could be NULL by default
                                        draw.color.legend,
                                        min.samples,
                                        plottypearg)
  return(out)
  
}

#test.results <-test.construct.data.and.annotations()
#cat(test.results$warnings.paragraph)


# test.sample.annotation.processing <- function(){
#   #-----------------------------------------
#   drawplot = function(filename,type = c("png","pdf","tiff","jpg","bmp"),width = 1, height = 1,heatmapres=FALSE)
#   {
#     # modify filename to remove illegal characters:
#     illegalcharacters = c("<",">",":","\"","\\?","\\|","\\*")  #
#     # only work on the later part of the filename after the "C://":
#     filename1 = substr(filename,1,4)
#     filename2 = substr(filename,5,length(strsplit(filename,split="")[[1]]))
#     for(i in 1:length(illegalcharacters))
#     {
#       filename2 = gsub(illegalcharacters[i],"_",filename2);  #print(illegalcharacters[i]); print(filename2)
#     }
#     filename = paste(filename1,filename2,sep="")
#     # modify resolution for heatmaps:
#     res = NA; pointsize = 12
#     if(heatmapres){res = 200;pointsize = 7}
#     
#     # run plotting function:
#     if(type=="png")
#     {
#       png(filename=paste(filename,type,sep="."),width=480*width,height=480*height,res=res,pointsize=pointsize)
#     }
#     if(type=="jpg")
#     {
#       jpeg(filename=paste(filename,type,sep="."),width=480*width,height=480*height)
#     }
#     if(type=="bmp")
#     {
#       bmp(filename=paste(filename,type,sep="."),width=480*width,height=480*height)
#     }
#     if(type=="pdf")
#     {
#       pdf(file=paste(filename,type,sep="."),width=7*width,height=7*height)
#     }
#     if(type=="tiff")
#     {
#       tiff(filename=paste(filename,type,sep="."),width=480*width,height=480*height)
#     }
#     return(filename) 
#   }
#   #-----------------------------------------
#   
#   sample.annotation.id.column = "File.Name"
#   smp.annot <- read.csv("S:/AMashadi-Hossein/Take2/2015_11_02/nsolverDev/SampleAnnotations.csv",row.names=sample.annotation.id.column,colClasses="character") 
#   smp.names <- rownames(smp.annot)
#   smp.annot <- smp.annot[-1,]
#   sampleannot.variabletypes = c("categorical", "categorical","continuous","categorical","continuous","continuous","categorical")
#   sampleannot.variables = c("Subtype.Call", "BMI.Cat","BMI", "constant.cat","constant.continuous","warn.continuous","cat.ref.1p2.replaced_w_1p3")
#   sampleannot.referencelevels = c("LuminalB", "1",NA,"const",NA,NA,"1.2") 
#   
#   tmp.annot <- construct.sample.annotation(smp.annot,sampleannot.variables,sampleannot.variabletypes,sampleannot.referencelevels)
#   tmp.aes <- annot.to.aes(annot = tmp.annot$smp.annot)
#   render.annot.aes.legends(annotcols = tmp.aes$annotcols,annotcols2 = tmp.aes$annotcols2,plottypearg = "pdf",path.results = "S:/AMashadi-Hossein/Take2/2015_11_02/nsolverDev/")
# }
# 
# 
# test.data.cleanup <-function(){
#   dat <- read.csv("S:/AMashadi-Hossein/Take2/2015_11_02/nsolverDev/RawData.csv",row.names = 1);dat<-dat[,-ncol(dat)]
# }
# 
